wine_rating = read_csv("./wine_rating.csv")
price_country=
wine_rating |>
group_by(country)|>
summarise(avg_by_country = mean(price)) |>
filter(!is.na(avg_by_country))
world_map_data <-
price_country |>
mutate(text = paste(country, "<br>Avg Price: $", round(avg_by_country, 2)))
fig_country_all <- plot_ly(
data = world_map_data,
type = "choropleth",
locations = ~country,
locationmode = "country names",
z = ~avg_by_country,
text = ~text,
colorscale = "Viridis"
)
fig_country_all <-
fig_country_all|>
layout(
geo = list(
showframe = FALSE,
projection = list(type = 'mercator')
),
title = "Average Wine Price by Country"
)
fig_country_all
price_country <-
price_country |>
arrange(desc(avg_by_country))
top_10_country =
head(price_country, 10) |>
mutate(rank = 1:10) |>
transform(rank = 1:10)
country10_map_data <- top_10_country |>
mutate(text = paste("Rank: ", rank, "<br>Country: ", country, "<br>Avg Price: $", round(avg_by_country, 2)))
fig_country_10map <- plot_ly(
data = country10_map_data,
type = "choropleth",
locations = ~country,
locationmode = "country names",
z = ~avg_by_country,
text = ~text,
colorscale = "YlGnBu"
)
fig_country_10map <-
fig_country_10map |>
layout(
geo = list(
showframe = FALSE,
projection = list(type = 'mercator')
),
title = "Top 10 Countries Based on Average Wine Price"
)
fig_country_10map
price_region=
wine_rating |>
group_by(country,region)|>
summarise(avg_by_region = mean(price)) |>
filter(!is.na(avg_by_region))|>
arrange(desc(avg_by_region))
top_10_regions <- head(price_region, 10)
fig_region10 <- plot_ly(
data = top_10_regions,
type = "bar",
x = ~reorder(region, -avg_by_region),
y = ~avg_by_region,
color = ~country,
text = ~paste("Country: ", country, "<br>Avg Price: $", round(avg_by_region, 2)),
marker = list(size = 10)
)
fig_region10 <-
fig_region10 |>
layout(
title = "Top 10 Regions Based on Average Wine Price",
xaxis = list(title = "Region"),
yaxis = list(title = "Average Wine Price"),
showlegend = TRUE
)
fig_region10
price_winery=
wine_rating |>
group_by(country,region, winery)|>
summarise(avg_by_winery = mean(price)) |>
filter(!is.na(avg_by_winery))|>
arrange(desc(avg_by_winery))
price_winery_dis=
wine_rating |>
group_by(country,region,winery)|>
mutate(avg_by_winery = mean(price)) |>
filter(!is.na(avg_by_winery))|>
select(country,region,winery,price,avg_by_winery)|>
arrange(desc(avg_by_winery))
top_10_wineries <- price_winery_dis |>
group_by(avg_by_winery) |>
nest() |>
arrange(desc(avg_by_winery)) |>
head(10) |>
unnest(cols = data)
# Create a Plotly boxplot
boxplot_plotly <- plot_ly(
data = top_10_wineries,
type = "box",
x = ~winery,
y = ~price,
marker = list(color = "blue")
)
# Customize layout
boxplot_plotly <- boxplot_plotly |>
layout(
title = "Price Distribution for Top 10 Wineries",
xaxis = list(title = "Winery"),
yaxis = list(title = "Price"),
showlegend = FALSE
)
# Display the Plotly plot
boxplot_plotly
filtered_price <-
wine_rating |>
select(price)|>
filter(!is.na(price), price > 0)
summary(filtered_price)
## price
## Min. : 3.150
## 1st Qu.: 9.902
## Median : 15.950
## Mean : 33.025
## 3rd Qu.: 32.500
## Max. :3410.790
histogram_plotly_log <- plot_ly(
data = filtered_price,
type = "histogram",
x = ~log(price),
nbinsx = 20, # Adjust the number of bins as needed
marker = list(color = "skyblue")
)
histogram_plotly_log <- histogram_plotly_log |>
layout(
title = "Log-Transformed Price Distribution",
xaxis = list(title = "Log(Price)"),
yaxis = list(title = "Frequency")
)
histogram_plotly_log
price_us=
wine_rating |>
filter(country=="United States")|>
select(country,region, winery,price)
price_us_200=
price_us|>
filter(price<200)
# Create a histogram
histogram_plotly_us <- plot_ly(
data = price_us_200,
type = "histogram",
x = ~price,
nbinsx = 10, # Adjust the number of bins as needed
marker = list(color = "lightblue")
)
# Customize layout
histogram_plotly_us <- histogram_plotly_us |>
layout(
title = "Wine Price Distribution in the US (Filtered under 200)",
xaxis = list(title = "Price"),
yaxis = list(title = "Frequency")
)
# Display the histogram
histogram_plotly_us
ggplot_rp_tf = wine_rating |>
ggplot(aes(x = log(price), y = rating, color = year)) +
geom_point()
ggplotly(ggplot_rp_tf)
lm_rp = lm(rating ~ log(price) + year, data = wine_rating)
resid_rp = wine_rating |>
modelr::add_residuals(lm_rp) |>
ggplot(aes(x = log(price), y = resid)) + geom_point()
ggplotly(resid_rp)
bootstrap_rp = wine_rating |>
modelr::bootstrap(n = 1000) |>
mutate(
models = map(strap, \(df) lm(rating ~ log(price) + year, data = df)),
results = map(models, broom::tidy)) |>
select(results) |>
unnest(results) |>
filter(term == "log(price)") |>
ggplot(aes(x = estimate)) + geom_density()
ggplotly(bootstrap_rp)
#remove missing values and any duplicates
wine_rating =
wine_rating |>
na.omit() |>
distinct() |>
mutate(name = gsub("\\d", "", winery), year = as.numeric(year),
name = iconv(name, from = "", to = "UTF-8", sub = ""),
winery = iconv(winery, from = "", to = "UTF-8", sub = ""),
region = iconv(region, from = "", to = "UTF-8", sub = ""))
# Plotly scatter plot with loess smoothing
plot_ly(wine_rating, x = ~price, y = ~rating, type = 'scatter', mode = 'markers', color = ~categories) %>%
layout(title = "Average wine price by category and production region",
xaxis = list(title = "Price"),
yaxis = list(title = "Rating"))
This scatterplot with loess smoothing generally indicates that higher priced wines have higher ratings for all the four categories of wines.
# #Within Countries, highest rating regions
wine_rating_summary <- wine_rating %>%
group_by(country, region) %>%
filter(number_of_ratings > 2000) %>%
summarise(mean_rating = mean(rating),
sd_rating = sd(rating),
n = n()) %>%
ungroup() %>%
arrange(region, desc(mean_rating)) %>%
top_n(20)
# Plotly bar plot
plot_ly(wine_rating_summary, x = ~reorder(region, -mean_rating), y = ~mean_rating, type = 'bar', color = ~country) %>%
layout(title = "Average wine rating by region (Within countries)",
xaxis = list(title = "Region"),
yaxis = list(title = "Mean Rating"))
Italy by far tops the list of the countries with the highest rated wine producing regions. Out of the top 27 regions in terms of mean-rating, almost half (13) of the wine producing regions are from Italy. The graph only shows the regions from which their wines received more than 2000 ratings.
# Data Preparation
rating_analysis_data <- wine_rating |>
filter(region %in% c("Napa Valley", "Napa County", "California")) |>
filter(categories != "Rose") |>
group_by(region, categories) |>
summarise(mean_rating = mean(rating)) |>
spread(key = categories, value = mean_rating)
# Convert to matrix (required for heatmap)
rating_matrix <- as.matrix(rating_analysis_data[,-1])
rownames(rating_matrix) <- rating_analysis_data$region
# Plotly Heatmap
fig <- plot_ly(x = colnames(rating_matrix),
y = rownames(rating_matrix),
z = rating_matrix,
type = "heatmap",
colorscale = "Viridis") %>%
layout(title = 'Rating Analysis by Wine Category in Napa Valley, Napa County, and California',
xaxis = list(title = 'Wine Category'),
yaxis = list(title = 'Region'))
fig
Narrowing down to the ratings of individual regions and the wines that they produce, this graph compares the mean-ratings of the wines from California, Napa County and Napa Valley. From the graph, the Red wines from Napa Valley, had the highest mean-ratings compared to the other two, just as the White category of wines from Napa County had the highest mean rating. Of the three regions considered, only California produced the Rose category of wines, and incidentally had the lowest mean-rating of the categories.
# Filter data if needed
welcoming_valleys_data <- wine_rating |>
filter(number_of_ratings > 2000) |>
group_by(region, categories) |>
summarise(mean_rating = mean(rating),
number_of_ratings = number_of_ratings,
mean_price = mean(price)) |>
ungroup() |>
filter(mean_rating > 4.3)
# Create a Plotly Bubble Chart
fig <- plot_ly(data = welcoming_valleys_data,
x = ~mean_price,
y = ~mean_rating,
size = ~number_of_ratings,
color = ~region,
colorscale = "Viridis",
type = 'scatter',
mode = 'markers',
marker = list(sizemode = 'diameter', sizeref = 0.7, opacity = 0.4)) %>%
layout(title = 'Most Welcoming Wine Valleys by Wine Category',
xaxis = list(title = 'Mean Price'),
yaxis = list(title = 'Mean Rating'),
colorway = list('region'))
fig
This bubble plot shows the relationship between the mean-prices and mean-ratings of wines, for only the wines that had more than 2000 ratings, and whose mean-rating was greater than 4.3 all grouped by the wine category. The sizes of the bubbles indicate the number of ratings, while the colors indicate the regions. From the plot, only the wine category of wine makes this cut. Again, only seven wine producing regions make this cut, four of them are from Italy.
all_wine_cate = wine_rating |>
filter(categories != "Varieties") |>
group_by(categories) |>
summarise(number = n())
all_wine_cate
## # A tibble: 4 × 2
## categories number
## <chr> <int>
## 1 Red 8666
## 2 Rose 397
## 3 Sparkling 1007
## 4 White 3764
price_plot= wine_rating |>
plot_ly(y = ~price, x= ~categories, color = ~ categories, type = "box", colors = "viridis") |>
layout(yaxis = list(range = c(0, 1500),dtick = 50))
price_plot
rating_plot= wine_rating |>
plot_ly(y = ~rating, x= ~categories, color = ~ categories, type = "box", colors = "viridis") |>
layout(yaxis = list(range = c(0, 5), dtick = 0.2))
rating_plot